Munge data

# Load libraries
library(readr)
library(dplyr)
library(magrittr)
library(tidyr)
library(stringr)
library(ggplot2)
library(gridExtra)
library(scales)
library(Cairo)
library(grid)
library(vcd)
library(countrycode)

# Load data and add labels
# TODO: Use non-unicode-mangled data
# The .RData files define `responses.orgs` (one row per surveyed organization)
# and `responses.countries` (one row per organization-country pair).
load(file.path(PROJHOME, "data_raw", "responses_orgs.RData"))
load(file.path(PROJHOME, "data_raw", "responses_countries.RData"))
# Label CSVs hold a `varlabel` column with human-readable question text,
# one row per variable.
responses.orgs.labs <- read_csv(file.path(PROJHOME, "data_raw", "response_orgs_labels.csv"))
responses.countries.labs <- read_csv(file.path(PROJHOME, "data_raw", "response_countries_labels.csv"))
 
# Attach question text as Hmisc variable labels (self=FALSE labels the
# columns, not the data frame object). Assumes the CSVs list variables in the
# same order as the data frame columns -- TODO confirm.
Hmisc::label(responses.orgs, self=FALSE) <- responses.orgs.labs$varlabel
Hmisc::label(responses.countries, self=FALSE) <- responses.countries.labs$varlabel

# Add a nicer short ID to each response to reference in the article
# (random IDs starting at 1001; seed fixed so IDs are reproducible)
set.seed(1234)
nicer.ids <- data_frame(survey.id = responses.orgs$survey.id) %>%
  mutate(clean.id = 1000 + sample(1:n()))
write_csv(nicer.ids, path=file.path(PROJHOME, "data", "id_lookup_WILL_BE_OVERWRITTEN.csv"))

The data is split into two sets: responses.orgs has a row for each surveyed organization and responses.countries has a row for each country organizations responded for (1-4 countries per organization). For ease of analysis, this combines them into one larger dataframe (so organization-level data is repeated). It also removes columns that were added manually, where an RA coded whether a country was mentioned in different questions (with a column for each country!).

# Merge organization-level and country-level responses into one dataframe
# (organization-level answers repeat for each country an org reported on),
# drop the hand-coded country-mention dummy columns, attach the clean short
# IDs, and move both IDs to the front.
merged <- left_join(responses.orgs, responses.countries, by="survey.id")
merged <- select(merged, -contains("_c", ignore.case=FALSE))  # Get rid of all the dummy vars
merged <- left_join(merged, nicer.ids, by="survey.id")
responses.all <- select(merged, survey.id, clean.id, everything())

Convert some responses into numeric indexes:

# Lookup tables mapping the factor levels of the opinion questions onto
# numeric index values (NA = no usable answer). Joined onto responses.full
# below.
# NOTE(review): the numeric vectors are matched positionally to levels(...)
# of the loaded data, so they silently depend on factor level order --
# confirm against the codebook.
importance <- data_frame(Q3.19 = levels(responses.countries$Q3.19),
                         importance = c(2, 1, 0, NA))
# Band boundaries used to shade the average-importance plot's background;
# level.ordered can reference level because data_frame evaluates its
# arguments sequentially.
importance.levels <- data_frame(start=c(0, 1, 2),
                                end=c(1, 2, 3),
                                level=c("Not important", "Somewhat important", 
                                        "Most important"),
                                level.ordered=factor(level, levels=level, ordered=TRUE))

positivity <- data_frame(Q3.25 = levels(responses.countries$Q3.25),
                         positivity = c(-1, 1, 0, NA))
improvement <- data_frame(Q3.26 = levels(responses.countries$Q3.26),
                          improvement = c(1, 0, -1, NA))

# Cho data: TIP policy index plus tier ratings, collapsed to one row per
# country.
# change_tip is negated so that a positive value means improvement (a falling
# tier number); change_policy is positive when the policy index p rose.
# NOTE(review): first()/last() depend on row order within each country and
# there is no arrange() here -- confirm policy_index.csv is sorted by year.
tip.change <- read_csv(file.path(PROJHOME, "data", "policy_index.csv")) %>%
  group_by(countryname) %>%
  summarise(avg_tier = mean(tier, na.rm=TRUE),
            change_tip = -(last(na.omit(tier), default=NA) - 
                             first(na.omit(tier), default=NA)),
            change_policy = last(na.omit(p), default=NA) - 
              first(na.omit(p), default=NA)) %>%
  mutate(cow = countrycode(countryname, "country.name", "cown"))

# Funding: US TIP-related grants, one row per grant
# NOTE(review): 555 appears to be a project-specific code for Serbia that
# countrycode cannot round-trip, hence the manual patch-up of both cowcode
# and countryname -- confirm.
funding.raw <- read_csv(file.path(PROJHOME, "data_raw", "funding_clean.csv")) %>%
  mutate(cowcode = ifelse(country == "Serbia", 555, cowcode),
         countryname = countrycode(cowcode, "cown", "country.name"),
         countryname = ifelse(cowcode == 555, "Serbia", countryname)) %>%
  filter(!is.na(countryname)) 

# Total and average grant amount per country, across all recipient types
funding.all <- funding.raw %>%
  group_by(countryname) %>%
  summarise(total.funding = sum(amount, na.rm=TRUE),
            avg.funding = mean(amount, na.rm=TRUE)) 

# Same, restricted to grants that went to NGOs/nonprofits
funding.ngos <- funding.raw %>%
  filter(recipient_type %in% c("NGO", "NPO")) %>%
  group_by(countryname) %>%
  summarise(total.funding.ngos = sum(amount, na.rm=TRUE),
            avg.funding.ngos = mean(amount, na.rm=TRUE)) 

# Master analysis dataframe: survey responses + country-level covariates +
# numeric versions of the opinion questions
responses.full <- responses.all %>%
  left_join(tip.change, by=c("work.country" = "countryname")) %>%
  left_join(funding.all, by=c("work.country" = "countryname")) %>%
  left_join(funding.ngos, by=c("work.country" = "countryname")) %>%
  left_join(positivity, by = "Q3.25") %>%
  left_join(importance, by = "Q3.19") %>%
  left_join(improvement, by = "Q3.26") %>%
  # Checkbox questions: 1 = checked, anything else / NA = not checked
  mutate(received.funding = ifelse(Q3.18_3 != 1 | is.na(Q3.18_3), FALSE, TRUE),
         # NOTE(review): this maps "Q3.18_5 not checked" to TRUE -- the
         # opposite polarity of received.funding above. Correct only if
         # Q3.18_5 is a "no US involvement"-type option; confirm against the
         # questionnaire.
         us.involvement = ifelse(Q3.18_5 != 1 | is.na(Q3.18_5), TRUE, FALSE),
         Q3.19 = factor(Q3.19, levels=c("Most important actor", 
                                        "Somewhat important actor", 
                                        "Not an important actor", 
                                        "Don't know"),
                        ordered=TRUE),
         # Drop the tiny "Negative" category for the 3-cell tables below.
         # NOTE(review): if Q3.25 is still a factor here, ifelse() returns
         # the underlying integer codes rather than the labels -- verify
         # Q3.25's class at this point.
         Q3.25_collapsed = ifelse(Q3.25 == "Negative", NA, Q3.25))

# Country-level summary: per-country mean/sd/n of each numeric opinion index
country.indexes <- responses.full %>%
  filter(!is.na(work.country)) %>%
  group_by(work.country) %>%
  # Needs mutate + mutate_each + select + unique because you can't mix 
  # summarise + summarise_each. See http://stackoverflow.com/a/31815540/120898
  mutate(num.responses = n()) %>%
  mutate_each(funs(score = mean(., na.rm=TRUE), stdev = sd(., na.rm=TRUE), 
                   n = sum(!is.na(.))),
              c(positivity, importance, improvement)) %>%
  # After the grouped mutate, every row within a country carries identical
  # summary values, so selecting only those columns and unique()-ing
  # collapses the data to one row per country.
  select(work.country, num.responses, matches("positivity_|importance_|improvement_")) %>%
  unique %>%
  ungroup() %>%
  arrange(desc(num.responses))


# Useful functions
# A clean, minimal ggplot theme built on theme_bw: white panels, no panel
# border, no axis lines or ticks, faint vertical-only gridlines, Source Sans
# Pro typography, and a bottom legend. Returns a ggplot2 theme object that
# can be added to any plot.
theme_clean <- function(base_size=9, base_family="Source Sans Pro Light") {
  theme_bw(base_size, base_family) + 
    theme(panel.background = element_rect(fill="#ffffff", colour=NA),
          axis.title.x=element_text(vjust=-0.2), axis.title.y=element_text(vjust=1.5),
          title=element_text(vjust=1.2, family="Source Sans Pro Semibold"),
          panel.border = element_blank(), axis.line=element_blank(),
          panel.grid.minor=element_blank(),
          panel.grid.major.y = element_blank(),
          # Keep only light vertical gridlines; most plots in this document
          # are coord_flip'd bar charts, so these read as value guides
          panel.grid.major.x = element_line(size=0.25, colour="grey90"),
          axis.ticks=element_blank(),
          legend.position="bottom", 
          axis.title=element_text(size=rel(0.8), family="Source Sans Pro Semibold"),
          strip.text=element_text(size=rel(0.9), family="Source Sans Pro Semibold"),
          strip.background=element_rect(fill="#ffffff", colour=NA),
          panel.margin=unit(1, "lines"), legend.key.size=unit(.7, "line"),
          legend.key=element_blank())
}

# Return a data frame of counts and proportions for multiple responses
#
# Summarizes a multiple-response (checkbox) question.
#   df:     data frame containing survey.id plus the checkbox columns
#   cols:   character vector of checkbox column names (1 = checked, NA = not)
#   labels: human-readable label for each column, in the same order as cols
#   total:  if TRUE, append a "Total responses" row (coerces the table to a
#           character matrix)
# Returns list(df = summary table, denominator = number of respondents who
# checked at least one option).
separate.answers.summary <- function(df, cols, labels, total=FALSE) {
  cols.to.select <- which(colnames(df) %in% cols)
  
  # Denominator: respondents who checked at least one of the options
  denominator <- df %>%
    select(cols.to.select) %>%
    mutate(num.answered = rowSums(., na.rm=TRUE)) %>%
    filter(num.answered > 0) %>%
    nrow()
  
  df <- df %>%
    select(survey.id, cols.to.select) %>%
    gather(question, value, -survey.id) %>%
    # BUG FIX: anchor levels explicitly to cols. Without levels=, factor()
    # sorts level names alphabetically ("Q3.9_10" sorts before "Q3.9_2"), so
    # labels= were matched to the wrong columns whenever a question had 10 or
    # more options.
    mutate(question = factor(question, levels=cols, labels=labels, ordered=TRUE)) %>%
    group_by(question) %>%
    summarize(response = sum(value, na.rm=TRUE), 
              pct = round(response / denominator * 100, 2),
              plot.pct = response / denominator)
  
  colnames(df) <- c("Answer", "Responses", "%", "plot.pct")
  
  if (total) {
    df <- df %>% select(1:3)
    df <- rbind(as.matrix(df), c("Total responses", denominator, "—"))
  }
  
  return(list(df=df, denominator=denominator))
}

# Create a character vector of significance stars from p-values:
# [0, .001] "***", (.001, .01] "**", (.01, .05] "*", (.05, .1] ".", (.1, 1] " "
add.stars <- function(x) {
  stars <- symnum(x, corr = FALSE,
                  cutpoints = c(0, .001, .01, .05, .1, 1),
                  symbols = c("***", "**", "*", ".", " "))
  as.character(stars)
}

NGO opinions of US activity

# Select just the columns that have cowcodes embedded in them
# (values are factors, so levels(.)[.] recovers the underlying numbers)
active.embassies.raw <- responses.countries %>%
  select(contains("_c", ignore.case=FALSE)) %>%
  mutate_each(funs(as.numeric(levels(.))[.]))  # Convert values to numeric

# Select only the rows where they responded (i.e. not all columns are NA)
# NOTE(review): all(!is.na(.)) is TRUE only when *every* column is non-NA,
# yet it is stored as "all.missing" and the stated intent is "not all
# columns are NA" (which would be any(!is.na(.))). Equivalent only if each
# respondent has either all or none of the dummies filled in -- confirm.
num.responses <- active.embassies.raw %>%
  rowwise() %>% do(all.missing = all(!is.na(.))) %>%
  ungroup() %>% mutate(all.missing = unlist(all.missing)) %>%
  summarise(total = sum(all.missing))

# Tidy cowcode columns and summarize most commonly mentioned countries
# The str_replace strips the "QX.Y_c" prefix, leaving the raw cowcode;
# 2070 is handled manually because countrycode has no entry for the EU.
active.embassies <- active.embassies.raw %>%
  gather(country.raw, num) %>%
  group_by(country.raw) %>% summarise(num = sum(num, na.rm=TRUE)) %>%
  mutate(country.raw = str_replace(country.raw, "Q.*c", ""),
         country = countrycode(country.raw, "cown", "country.name"),
         country = ifelse(country.raw == "2070", "European Union", country)) %>%
  ungroup() %>% mutate(prop = num / num.responses$total,
                       prop.nice = sprintf("%.1f%%", prop * 100))

Which embassies or foreign governments did NGOs report as active partners in the fight against human trafficking?

# Countries mentioned more than 10 times; factor levels are set while sorted
# ascending so coord_flip later puts the biggest bar on top
active.embassies.top <- active.embassies %>%
  arrange(num) %>% select(-country.raw) %>%
  filter(num > 10) %>%
  mutate(country = factor(country, levels=country, ordered=TRUE)) %>%
  arrange(desc(num))

# Echo the most-mentioned countries table
active.embassies %>% arrange(desc(num)) %>% 
  select(-country.raw) %>% filter(num > 10)
## Source: local data frame [12 x 4]
## 
##      num        country       prop prop.nice
##    (dbl)          (chr)      (dbl)     (chr)
## 1    260  United States 0.77380952     77.4%
## 2     43 United Kingdom 0.12797619     12.8%
## 3     35    Netherlands 0.10416667     10.4%
## 4     30         France 0.08928571      8.9%
## 5     26         Norway 0.07738095      7.7%
## 6     26 European Union 0.07738095      7.7%
## 7     24    Switzerland 0.07142857      7.1%
## 8     21         Sweden 0.06250000      6.2%
## 9     19      Australia 0.05654762      5.7%
## 10    17        Germany 0.05059524      5.1%
## 11    17          Italy 0.05059524      5.1%
## 12    12         Canada 0.03571429      3.6%
nrow(active.embassies)  # Number of countries mentioned
## [1] 64
num.responses$total  # Total responses
## [1] 336
# Most active embassies
# Save Q3.7 to a CSV for hand coding
# (the hand-coded copy adds a `clean` column of comma-separated country
# names, used below; the file written here is scratch and safe to overwrite)
most.active <- responses.countries %>%
  select(Q3.7) %>%
  filter(!is.na(Q3.7))
write_csv(most.active, path=file.path(PROJHOME, "data", 
                                      "most_active_WILL_BE_OVERWRITTEN.csv"))

# Read in hand-coded CSV
# Fail loudly if the hand-coded file hasn't been created yet
if (file.exists(file.path(PROJHOME, "data", "most_active.csv"))) {
  most.active <- read_csv(file.path(PROJHOME, "data", "most_active.csv"))
} else {
  stop("data/most_active.csv is missing")
}

# Split comma-separated countries, unnest them into multiple rows, and 
# summarize most active countries
# (transform() rather than mutate() is used to create the list-column from
# strsplit; proportions use all hand-coded responses as the denominator)
most.active.clean <- most.active %>%
  transform(clean = strsplit(clean, ",")) %>%
  unnest(clean) %>%
  mutate(clean = str_trim(clean)) %>%
  group_by(clean) %>%
  summarise(total = n()) %>%
  mutate(prop = total / nrow(most.active),
         prop.nice = sprintf("%.1f%%", prop * 100))

Which countries or embassies have been the most active?

# Echo the most-active countries table
most.active.clean %>% arrange(desc(total))
## Source: local data frame [39 x 4]
## 
##             clean total       prop prop.nice
##             (chr) (int)      (dbl)     (chr)
## 1   United States   187 0.70300752     70.3%
## 2            None    16 0.06015038      6.0%
## 3  European Union    14 0.05263158      5.3%
## 4             All    12 0.04511278      4.5%
## 5       Australia     7 0.02631579      2.6%
## 6           Italy     7 0.02631579      2.6%
## 7     Switzerland     7 0.02631579      2.6%
## 8  United Kingdom     7 0.02631579      2.6%
## 9          Norway     6 0.02255639      2.3%
## 10    Philippines     6 0.02255639      2.3%
## ..            ...   ...        ...       ...
nrow(most.active.clean) - 1  # Subtract one because of "None"s
## [1] 38

Over the last 10–15 years, has the United States or its embassy been active in the fight against human trafficking in X?

# print() returns its argument invisibly, so this one pipeline shows both
# the raw counts and the proportions
responses.countries$Q3.8 %>% table %>% print %>% prop.table
## .
##         No        Yes Don't know 
##         39        343        149
## .
##         No        Yes Don't know 
## 0.07344633 0.64595104 0.28060264

Side-by-side graph of active countries + most active countries

# Left panel data: mentioned-as-partner counts, with "All" and "None" added
# as zero-count placeholder rows so both panels can share one country axis.
# NOTE(review): bind_rows() of a factor and a character `country` coerces to
# character, so the factor is rebuilt afterwards.
plot.data <- active.embassies.top %>%
  bind_rows(data_frame(num=0, country=c("All", "None"), 
                       prop=0, prop.nice="")) %>%
  arrange(num) %>%
  mutate(country = factor(country, levels=country, ordered=TRUE))

# Right panel data: most-active counts, restricted to the countries shown in
# the left panel and put on the identical factor scale
plot.data.active <- most.active.clean %>%
  filter(clean %in% plot.data$country) %>%
  mutate(country = factor(clean, levels=levels(plot.data$country), ordered=TRUE))

# Left panel: y axis reversed so the bars grow leftward, mirroring the right
# panel around the shared country labels; its own axis text is suppressed
fig.active <- ggplot(plot.data, aes(x=country, y=num)) + 
  geom_bar(stat="identity") + 
  geom_text(aes(label = prop.nice), size=3.5, hjust=1.3, 
            family="Source Sans Pro Light") + 
  labs(x=NULL, y="Number of times country was mentioned as a partner in anti-TIP work") + 
  scale_y_continuous(breaks=seq(0, max(active.embassies$num), by=25), 
                     trans="reverse", expand = c(.1, .1)) + 
  coord_flip() + 
  theme_clean() + 
  theme(axis.text.y = element_blank(), 
        axis.line.y = element_blank(),
        plot.margin = unit(c(1,0.5,1,1), "lines"))

# Right panel: carries the country labels, centered (hjust=0.5) so they sit
# between the two panels
fig.most.active <- ggplot(plot.data.active, aes(x=country, y=total)) + 
  geom_bar(stat="identity") + 
  geom_text(aes(label = prop.nice), size=3.5, hjust=-0.3, 
            family="Source Sans Pro Light") + 
  labs(x=NULL, y="Number of times country was mentioned as the most active partner in anti-TIP work") + 
  scale_y_continuous(expand = c(.15, .15)) + 
  coord_flip() + 
  theme_clean() + 
  theme(axis.text.y = element_text(hjust=0.5), 
        axis.line.y = element_blank(),
        plot.margin = unit(c(1,1,1,0), "lines"))
  
# Combine the panels side by side; save as PDF (Cairo for font embedding)
# and PNG
fig.embassies <- arrangeGrob(fig.active, fig.most.active, nrow=1)
grid.draw(fig.embassies)

ggsave(fig.embassies, filename=file.path(PROJHOME, "figures", "fig_embassies.pdf"),
       width=5, height=2, units="in", device=cairo_pdf, scale=2.5)
ggsave(fig.embassies, filename=file.path(PROJHOME, "figures", "fig_embassies.png"),
       width=5, height=2, units="in", scale=2.5)

Actual US activities

# Checkbox columns for Q3.9 ("What has the US done?") and their labels, in
# questionnaire order
cols <- c("Q3.9_1", "Q3.9_2", "Q3.9_3", "Q3.9_4", "Q3.9_5",
          "Q3.9_6", "Q3.9_7", "Q3.9_8", "Q3.9_9", "Q3.9_10")
labels <- c("Asking for legislation", "Convening conferences or workshops",
            "Raising awareness", "Providing resources or funding",
            "Increasing government attention", "Training government officials",
            "Contributing to a government action plan", "Other", "Don't know",
            "The US has not been involved in trafficking issues")

# NOTE(review): verify that separate.answers.summary() aligns labels with
# columns here -- with 10 options, default factor level sorting places
# "Q3.9_10" between "Q3.9_1" and "Q3.9_2", which would shift the labels.
activities <- separate.answers.summary(responses.countries, cols, labels)
activities$denominator  # Number of responses
## [1] 530
activities$df
## Source: local data frame [10 x 4]
## 
##                                                Answer Responses     %   plot.pct
##                                                (fctr)     (int) (dbl)      (dbl)
## 1                              Asking for legislation       164 30.94 0.30943396
## 2                  Convening conferences or workshops       207 39.06 0.39056604
## 3                                   Raising awareness       213 40.19 0.40188679
## 4                      Providing resources or funding       210 39.62 0.39622642
## 5                     Increasing government attention       216 40.75 0.40754717
## 6                       Training government officials       146 27.55 0.27547170
## 7            Contributing to a government action plan       113 21.32 0.21320755
## 8                                               Other        43  8.11 0.08113208
## 9                                          Don't know        26  4.91 0.04905660
## 10 The US has not been involved in trafficking issues       166 31.32 0.31320755
# Reverse the level order so coord_flip shows answers in questionnaire order
plot.data <- activities$df %>% 
  mutate(Answer=factor(Answer, levels=rev(labels), ordered=TRUE))

# Horizontal bar chart of the share of respondents selecting each activity
fig.activities <- ggplot(plot.data, aes(x=Answer, y=Responses)) +
  geom_bar(aes(y=plot.pct), stat="identity") + 
  labs(x=NULL, y=NULL) + 
  scale_y_continuous(labels = percent, 
                     breaks = seq(0, max(round(plot.data$plot.pct, 1)), by=0.1)) + 
  coord_flip() + theme_clean()
fig.activities

ggsave(fig.activities, filename=file.path(PROJHOME, "figures", "fig_activities.pdf"),
       width=6.5, height=5, units="in", device=cairo_pdf)
ggsave(fig.activities, filename=file.path(PROJHOME, "figures", "fig_activities.png"),
       width=6.5, height=5, units="in")

NGO opinions of US importance

General importance

# Distribution of answers to Q3.19 ("How important an actor is the US?")
plot.data <- responses.full %>%
  group_by(Q3.19) %>%
  summarize(num = n()) %>%
  na.omit() %>%
  mutate(prop = num / sum(num),
         prop.nice = sprintf("%.1f%%", prop * 100),
         # Reverse the level order so coord_flip puts "Most important" on top
         Q3.19 = factor(Q3.19, levels=rev(levels(Q3.19)), ordered=TRUE))
plot.data
## Source: local data frame [4 x 4]
## 
##                      Q3.19   num      prop prop.nice
##                     (fctr) (int)     (dbl)     (chr)
## 1     Most important actor   139 0.2673077     26.7%
## 2 Somewhat important actor   180 0.3461538     34.6%
## 3   Not an important actor    68 0.1307692     13.1%
## 4               Don't know   133 0.2557692     25.6%
fig.us_importance <- ggplot(plot.data, aes(x=Q3.19, y=prop)) + 
  geom_bar(stat="identity") + 
  labs(x=NULL, y=NULL) + 
  # BUG FIX: breaks were previously computed from plot.data$num (raw counts,
  # max 180), which generated ~1800 candidate breaks on a proportion axis.
  # The y aesthetic is prop, so the break range must come from prop
  # (mirroring the plot.pct-based breaks used for fig.activities).
  scale_y_continuous(labels = percent, 
                     breaks = seq(0, max(round(plot.data$prop, 1)), by=0.1)) + 
  coord_flip() + theme_clean()
fig.us_importance

ggsave(fig.us_importance, filename=file.path(PROJHOME, "figures", "fig_us_importance.pdf"),
       width=6.5, height=5, units="in", device=cairo_pdf)
ggsave(fig.us_importance, filename=file.path(PROJHOME, "figures", "fig_us_importance.png"),
       width=6.5, height=5, units="in")

Average importance by country

# Countries with at least 10 responses, ordered by mean importance score;
# axis labels append the response count, e.g. "Thailand (25)"
importance.plot <- country.indexes %>%
  filter(num.responses >= 10) %>%
  arrange(importance_score) %>%
  mutate(country_label = factor(work.country, levels=unique(work.country), 
                                labels=paste0(work.country, " (", num.responses, ")"),
                                ordered=TRUE)) 

# Mean +/- 1 SD importance per country, drawn over shaded bands marking the
# not/somewhat/most-important ranges (importance.levels defined earlier)
fig.avg_importance <- ggplot(importance.plot, aes(x=country_label, y=importance_score)) + 
  geom_rect(data=importance.levels, aes(x=NULL, y=NULL, ymin=start, ymax=end, 
                                        xmin=0, xmax=Inf, fill=level.ordered), alpha=0.5) + 
  geom_pointrange(aes(ymax=importance_score + importance_stdev,
                      ymin=importance_score - importance_stdev)) + 
  labs(x="Country (number of responses)", 
       y="Importance of the US in anti-TIP efforts (mean)") + 
  scale_fill_manual(values=c("grey90", "grey60", "grey30"), name=NULL) + 
  coord_flip() + 
  theme_clean() + theme(legend.position="bottom")
fig.avg_importance

ggsave(fig.avg_importance, filename=file.path(PROJHOME, "figures", "fig_avg_importance.pdf"),
       width=6.5, height=5, units="in", device=cairo_pdf)
ggsave(fig.avg_importance, filename=file.path(PROJHOME, "figures", "fig_avg_importance.png"),
       width=6.5, height=5, units="in")

Explaining variation in opinion of US importance

Does opinion of the US vary by:

# * Tier rating (average) or improvement in Cho score?
# * Whether an NGO has received US funding (or where the COUNTRY has received more TIP grants?)
# * Whether an NGO has interacted with the US
# * Whether a country is rich or poor (or some other quality)
# * Whether an NGO focuses on certain types of work?
# * In which countries does the US seem to have had more collaboration with NGOs?

TODO: Explaining variation in opinion of US positivity? (no because censoring)

# Analysis subset for the importance models: Q3.19 plus country-level
# covariates, restricted to rows with a Q3.19 answer, with an unordered copy
# of Q3.19 for the variance tests and aov below.
# FIX: change_policy was listed twice in select(); dplyr keeps only the
# first occurrence, so the duplicate was dead code and has been removed.
df.importance <- responses.full %>% 
  select(Q3.19, change_policy, avg_tier, change_tip, importance) %>% 
  filter(!is.na(Q3.19)) %>%
  mutate(importance_factor = factor(Q3.19, ordered=FALSE))

Average tier doesn’t show much because it doesn’t show any changes in time—just how bad the country is in general

# http://www.r-bloggers.com/analysis-of-variance-anova-for-multiple-comparisons/
# Mean and variance of avg_tier within each importance answer (precursor to
# the ANOVA below); trailing print both assigns and echoes
importance.means <- df.importance %>%
  group_by(Q3.19) %>%
  summarize(avg_points = mean(avg_tier, na.rm=TRUE),
            var_points = var(avg_tier, na.rm=TRUE)) %>%
  print
## Source: local data frame [4 x 3]
## 
##                      Q3.19 avg_points var_points
##                     (fctr)      (dbl)      (dbl)
## 1     Most important actor   2.054501  0.1036754
## 2 Somewhat important actor   1.927769  0.2153692
## 3   Not an important actor   1.768449  0.2980403
## 4               Don't know   1.787980  0.2715889

Plot group means and distributions

# Violins of avg_tier by importance answer, with individual responses as
# faint points and the group means overplotted as large dots
fig.importance <- ggplot(df.importance, aes(x=Q3.19, y=avg_tier)) +
  geom_violin(fill="grey90") + 
  geom_point(alpha=0.05, show.legend=FALSE) +
  geom_point(data=importance.means, aes(x=Q3.19, y=avg_points), size=5, show.legend=FALSE) + 
  labs(x="Opinion of US importance", y="Average TIP tier rating") + 
  coord_flip() + theme_clean()
fig.importance

Those means appear slightly different from each other. Is that really the case? Check with ANOVA, which assumes homogenous variance across groups. Throw every possible test at it—if null is rejected (p < 0.05 or whatever) then variance is likely heterogenous:

# Homogeneity-of-variance checks: Bartlett assumes normality; Levene and
# Fligner-Killeen are more robust to non-normality; Kruskal-Wallis is the
# nonparametric analogue of one-way ANOVA on the group locations
bartlett.test(avg_tier ~ importance_factor, data=df.importance)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  avg_tier by importance_factor
## Bartlett's K-squared = 32.694, df = 3, p-value = 3.737e-07
car::leveneTest(avg_tier ~ importance_factor, data=df.importance)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   3  10.224 1.586e-06 ***
##       454                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fligner.test(avg_tier ~ importance_factor, data=df.importance)  # Uses median
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  avg_tier by importance_factor
## Fligner-Killeen:med chi-squared = 24.984, df = 3, p-value = 1.556e-05
kruskal.test(avg_tier ~ importance_factor, data=df.importance)  # Nonparametric
## 
##  Kruskal-Wallis rank sum test
## 
## data:  avg_tier by importance_factor
## Kruskal-Wallis chi-squared = 16.197, df = 3, p-value = 0.001033

All of those p-values are tiny, so it’s clear that variance is not the same across groups. However, there’s a rule of thumb (super detailed example) that ANOVA is robust to heterogeneity of variance as long as the largest variance is less than four times the smallest variance.

Given that rule of thumb, the variance here isn’t that much of an issue

# Ratio of the largest to smallest within-group variance (rule of thumb:
# ANOVA is tolerably robust when this ratio is under 4)
df.importance %>% group_by(importance_factor) %>%
  summarise(variance = var(avg_tier, na.rm=TRUE)) %>%
  do(data_frame(ratio = max(.$variance) / min(.$variance)))
## Source: local data frame [1 x 1]
## 
##      ratio
##      (dbl)
## 1 2.874743

It would be cool to use Bayesian ANOVA to account for non-homogenous variances (see John Kruschke’s evangelizing), since it handles violations of ANOVA assumptions nicely. However, in his example, the ratio of min/max variance is huge, so it does lead to big differences in results:

#
# read_csv("http://www.indiana.edu/~kruschke/DoingBayesianDataAnalysis/Programs/NonhomogVarData.csv") %>%
#   group_by(Group) %>%
#   summarise(variance = var(Y)) %>%
#   do(data_frame(ratio = max(.$variance) / min(.$variance)))
#   # ratio = 64
#

With the variance issue handled, run the ANOVA

# One-way ANOVA of avg_tier across importance answers (outer parens both
# assign and print)
(importance.aov <- aov(avg_tier ~ importance_factor, data=df.importance))
## Call:
##    aov(formula = avg_tier ~ importance_factor, data = df.importance)
## 
## Terms:
##                 importance_factor Residuals
## Sum of Squares            5.40961  93.96314
## Deg. of Freedom                 3       454
## 
## Residual standard error: 0.4549365
## Estimated effects may be unbalanced
## 62 observations deleted due to missingness

There is some significant difference between groups. Look at pairwise comparisons between all the groups to (kind of) decompose that finding

# Tukey's HSD: all pairwise group differences with family-wise 95% CIs
(importance.pairs <- TukeyHSD(importance.aov, "importance_factor"))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = avg_tier ~ importance_factor, data = df.importance)
## 
## $importance_factor
##                                                        diff        lwr          upr     p adj
## Somewhat important actor-Most important actor   -0.12673275 -0.2643637  0.010898224 0.0835093
## Not an important actor-Most important actor     -0.28605296 -0.4776665 -0.094439366 0.0007784
## Don't know-Most important actor                 -0.26652163 -0.4194140 -0.113629236 0.0000522
## Not an important actor-Somewhat important actor -0.15932021 -0.3441336  0.025493209 0.1185914
## Don't know-Somewhat important actor             -0.13978888 -0.2840675  0.004489721 0.0615006
## Don't know-Not an important actor                0.01953133 -0.1769115  0.215974193 0.9940858

Plot the differences

# Turn the TukeyHSD result into a dataframe for plotting. Pair names come
# from the matrix row names; factor(levels=pair) preserves that row order on
# the plot axis.
importance.pairs.plot <- data.frame(importance.pairs$importance_factor) %>%
  mutate(pair = row.names(.),
         pair = factor(pair, levels=pair, ordered=TRUE),
         stars = add.stars(p.adj))

# Point-range plot of each pairwise difference with its family-wise CI and
# significance stars; a vertical line at zero marks "no difference"
fig.importance.pairs <- ggplot(importance.pairs.plot, 
                               aes(x=pair, y=diff, ymax=upr, ymin=lwr)) + 
  geom_hline(yintercept=0) + 
  geom_text(aes(label=stars), nudge_x=0.25) +
  geom_pointrange() + 
  theme_clean() + coord_flip()
fig.importance.pairs

Another way of checking group means in non-homogenous data is to use ordinal logistic regression. Here’s an ordered logit and corresponding predicted probabilities.

# Cumulative link (ordered logit) model of the importance answer on average
# tier rating; Hess=TRUE stores the Hessian so summary() can report SEs
model.importance <- ordinal::clm(Q3.19 ~ avg_tier, data=df.importance, 
                                 link="logit", Hess=TRUE)
summary(model.importance)
## formula: Q3.19 ~ avg_tier
## data:    df.importance
## 
##  link  threshold nobs logLik  AIC     niter max.grad cond.H 
##  logit flexible  458  -590.92 1189.83 6(0)  7.09e-08 2.3e+02
## 
## Coefficients:
##          Estimate Std. Error z value Pr(>|z|)    
## avg_tier  -0.8634     0.1813  -4.761 1.92e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##                                                 Estimate Std. Error z value
## Most important actor|Somewhat important actor    -2.6277     0.3703  -7.095
## Somewhat important actor|Not an important actor  -1.0134     0.3512  -2.886
## Not an important actor|Don't know                -0.4322     0.3495  -1.236
## (62 observations deleted due to missingness)
# Predicted probabilities
# over a grid of average tier ratings from 0 to 3
newdata <- data_frame(avg_tier = seq(0, 3, 0.1))
pred.importance <- predict(model.importance, newdata, interval=TRUE)

# Create plot data
# predict() returns $fit/$lwr/$upr (presumably one column per outcome level
# -- see ordinal::predict.clm); gather each into long form and join on
# avg_tier + outcome name
pred.plot.lower <- cbind(newdata, pred.importance$lwr) %>%
  gather(importance, lwr, -c(1:ncol(newdata)))
pred.plot.upper <- cbind(newdata, pred.importance$upr) %>%
  gather(importance, upr, -c(1:ncol(newdata)))

pred.plot.data <- cbind(newdata, pred.importance$fit) %>%
  gather(importance, importance_prob, -c(1:ncol(newdata))) %>%
  left_join(pred.plot.lower, by=c("avg_tier", "importance")) %>%
  left_join(pred.plot.upper, by=c("avg_tier", "importance"))

# Ribbon plot of predicted probabilities with confidence bands
# (grey palette kept commented out in favour of the default colours)
importance.colors <- c("grey20", "grey40", "grey60", "grey80")
ggplot(pred.plot.data, aes(x=avg_tier, y=importance_prob)) +  
  geom_ribbon(aes(ymax=upr, ymin=lwr, fill=importance), 
              alpha=0.2) + 
  geom_line(aes(colour=importance), size=2) + 
  scale_y_continuous(labels=percent) + 
  labs(x="Average tier rating in country", 
       y="Predicted probability of assigning importance") + 
  # scale_fill_manual(values=importance.colors, name=NULL) + 
  # scale_colour_manual(values=importance.colors, name=NULL) +
  theme_clean()

Change in TIP score is a little more interesting—countries that move the most over time only see positive NGO opinions

# Violins of change in TIP tier rating by importance answer
ggplot(df.importance, aes(x=Q3.19, y=change_tip, fill=Q3.19)) + 
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="Opinion of US", y="Change in TIP tier rating") + 
  coord_flip()

Using the Cho score shows a little more variation, but essentially tells the same thing. NGOs that work in countries that have seen the most actual improvement in TIP policies almost unanimously have a positive opinion of the US.

# Violins of change in the Cho policy index by importance answer
ggplot(df.importance, aes(x=Q3.19, y=change_policy, fill=Q3.19)) +
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="Opinion of US", y="Change in TIP policy index") + 
  coord_flip()

NGO opinions of the importance of the US’s work are unrelated to actual changes, though.

# NOTE(review): this plot is identical to the previous one except for the
# x-axis label; given the surrounding text, one of the two was probably
# meant to plot a different variable -- confirm.
ggplot(df.importance, aes(x=Q3.19, y=change_policy, fill=Q3.19)) +
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="US importance", y="Change in TIP policy index") + 
  coord_flip()

Variations in opinion of US based on involvement with the US

General NGO interaction with the US has no bearing on NGO opinions of the US

# 3x2 table: opinion of the US (negatives dropped) by US involvement
us.table <- responses.full %>%
  xtabs(~ Q3.25_collapsed + us.involvement, .) %>% print
##                us.involvement
## Q3.25_collapsed FALSE TRUE
##      Don't know     8   28
##      Mixed         16   47
##      Positive      45  168
chisq.test(us.table)
## 
##  Pearson's Chi-squared test
## 
## data:  us.table
## X-squared = 0.51495, df = 2, p-value = 0.773
# Permutation-based test of independence (vcd); seed fixed so the
# permutation p-value is reproducible
set.seed(1234)
coindep_test(us.table, n=5000)
## 
##  Permutation test for conditional independence
## 
## data:  us.table
## f(x) = 0.55384, p-value = 0.7598
# Mosaic plot of the same table
mosaic(us.table, 
       labeling_args=list(set_varnames=c(us.involvement="US involvement", 
                                         Q3.25_collapsed="Opinion of US"),
                          gp_labels=(gpar(fontsize=8))), 
       gp_varnames=gpar(fontsize=10, fontface=2))

More specifically, the receipt of funding has no bearing on NGO opinions of the US

# 3x2 table: opinion of the US (negatives dropped) by receipt of US funding
funding.table <- responses.full %>%
  xtabs(~ Q3.25_collapsed + received.funding, .) %>% print
##                received.funding
## Q3.25_collapsed FALSE TRUE
##      Don't know    26   10
##      Mixed         51   12
##      Positive     146   67
chisq.test(funding.table)
## 
##  Pearson's Chi-squared test
## 
## data:  funding.table
## X-squared = 3.6824, df = 2, p-value = 0.1586
# Permutation-based test of independence (vcd); seed fixed for
# reproducibility
set.seed(1234)
coindep_test(funding.table, n=5000)
## 
##  Permutation test for conditional independence
## 
## data:  funding.table
## f(x) = 1.4085, p-value = 0.1568
# Mosaic plot of the same table
mosaic(funding.table, 
       labeling_args=list(set_varnames=c(received.funding="Received US funding", 
                                         Q3.25_collapsed="Opinion of US"),
                          gp_labels=(gpar(fontsize=8))), 
       gp_varnames=gpar(fontsize=10, fontface=2))

NGOs in countries that receive substantial US funding do hold opinions of the US, but there's no difference between positive and mixed opinions. This is true accounting for both average and total funding. Opinions are not driven by cooptation—see chapter 1 for boomerang-type arguments about cooptation by donors—so in this case, the NGOs aren't just being bought out.

# Opinion of the US vs. total TIP funding to the country
ggplot(responses.full, aes(x=Q3.25_collapsed, y=total.funding, fill=Q3.25_collapsed)) + 
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="Opinion of US", y="Total TIP funding to country") + 
  scale_y_continuous(labels=dollar)

# NOTE(review): this plot uses Q3.25 while the neighboring plots use
# Q3.25_collapsed -- probably an oversight; confirm which was intended.
ggplot(responses.full, aes(x=Q3.25, y=avg.funding, fill=Q3.25)) + 
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="Opinion of US", y="Average TIP funding to country") + 
  scale_y_continuous(labels=dollar)

The same is true if looking just at US funding designated for just NGOs and NPOs

# Opinion of the US vs. total NGO/NPO-designated TIP funding
ggplot(responses.full, aes(x=Q3.25_collapsed, y=total.funding.ngos, fill=Q3.25_collapsed)) + 
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="Opinion of US", y="Total NGO-only TIP funding to country") + 
  scale_y_continuous(labels=dollar)

# Opinion of the US vs. average NGO/NPO-designated TIP funding
ggplot(responses.full, aes(x=Q3.25_collapsed, y=avg.funding.ngos, fill=Q3.25_collapsed)) + 
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="Opinion of US", y="Average NGO-only TIP funding to country") + 
  scale_y_continuous(labels=dollar)

# Importance rating (Q3.19) vs. total TIP funding to the country
plot.data <- responses.full %>% select(Q3.19, total.funding) %>% filter(!is.na(Q3.19))
ggplot(plot.data, aes(x=Q3.19, y=total.funding, fill=Q3.19)) + 
  geom_violin() + 
  geom_point(alpha=0.3, position=position_jitterdodge()) + 
  labs(x="US importance", y="Total TIP funding to country") + 
  scale_y_continuous(labels=dollar) +
  coord_flip()

# Type of work
# TODO: work.country is not the most reliable identifier---there be NAs
# Q2.2_X
# Reshape the type-of-work dummies (Q2.2_1..Q2.2_4) to long form and compute,
# per type, the number of rows (bloop), the number of organizations doing that
# work (derp), and the share (asdf). Column names kept to match recorded output.
asdf <- responses.full %>% 
  select(survey.id, matches("Q2.2_\\d$")) %>%
  gather(type_of_work, value, -survey.id) %>%
  # Fixed: seq(1:4) was an anti-idiom (seq() applied to a vector acts like
  # seq_along(), so it only happened to equal 1:4); use 1:4 directly
  mutate(type_of_work = factor(type_of_work, levels=paste0("Q2.2_", 1:4), 
                               labels=c("Organs", "Sex", "Labor", "Other"), 
                               ordered=TRUE))
asdf %>%
  group_by(type_of_work) %>%
  summarise(bloop = n(),
            derp = sum(value, na.rm=TRUE),
            asdf = derp / n())
## Source: local data frame [4 x 4]
## 
##   type_of_work bloop  derp       asdf
##         (fctr) (int) (int)      (dbl)
## 1       Organs   603    44 0.07296849
## 2          Sex   603   522 0.86567164
## 3        Labor   603   363 0.60199005
## 4        Other   603   146 0.24212272
# Should be 30, 408, 294, 116 with 479 total

# Q2.3_
# Q2.4_X


# Opinion of the US vs. importance
# Can't do this because 3.25 is censored by 3.19 (Q3.25 was only answered when
# the US was rated at least somewhat important — see the zero columns below)
xtabs(~ Q3.25 + Q3.19, data = responses.full)
##             Q3.19
## Q3.25        Most important actor Somewhat important actor Not an important actor Don't know
##   Don't know                    8                       28                      0          0
##   Mixed                        21                       42                      0          0
##   Negative                      2                        0                      0          0
##   Positive                    107                      106                      0          0
# Marginal distribution of opinion of the US (Q3.25); smaller N than Q3.19
# because of the censoring noted above
table(responses.full$Q3.25)
## 
## Don't know      Mixed   Negative   Positive 
##         36         63          2        213
# Marginal distribution of US importance (Q3.19)
table(responses.full$Q3.19)
## 
##     Most important actor Somewhat important actor   Not an important actor               Don't know 
##                      139                      180                       68                      133
# Non-missing Ns confirm the censoring: 314 answered Q3.25 vs. 520 for Q3.19
sum(table(responses.full$Q3.25))
## [1] 314
sum(table(responses.full$Q3.19))
## [1] 520
# Counts and proportions of US importance in one pass (print() keeps the
# counts visible before prop.table() converts to shares)
responses.full$Q3.19 %>%
  table %>% print %>% prop.table
## .
##     Most important actor Somewhat important actor   Not an important actor               Don't know 
##                      139                      180                       68                      133
## .
##     Most important actor Somewhat important actor   Not an important actor               Don't know 
##                0.2673077                0.3461538                0.1307692                0.2557692
# Find country averages of government improvement, etc. - then show that X number of countries show improvement, etc. 
# Report by organization and by country - how many countries has the US had a positive influence + how many NGOs say the US has had a positive influence


# Merge country-level survey indexes with TIP policy changes, keeping only
# countries with enough responses (>= 10) to be meaningful
full <- left_join(country.indexes, tip.change,
                  by=c("work.country" = "countryname")) %>%
  filter(num.responses >= 10) %>%
  # After the filter, every row satisfies num.responses >= 10, so the original
  # ifelse(num.responses >= 10, work.country, "") was dead code; label directly
  mutate(country_label = work.country)


# Improvement score vs. change in TIP policy, one labeled point per country
# (layer order kept: hline is drawn last, on top of points and labels)
ggplot(full, aes(x = improvement_score, y = change_policy, label = work.country)) +
  geom_point() +
  geom_text(vjust = 1.5) +
  geom_hline(yintercept = 0) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(-2, 6))

# Horizontal bar chart of improvement score by country
ggplot(country.indexes,
       aes(x = work.country, y = improvement_score)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Compare improvement scores with actual changes in TIP score to get a sense of if NGO experiences reflect changes in rankings

# Horizontal bar chart of positivity score by country
ggplot(country.indexes,
       aes(x = work.country, y = positivity_score)) +
  geom_bar(stat = "identity") +
  coord_flip()

# Same opinion-by-importance cross-tab, on the raw country-level responses
xtabs(~ Q3.25 + Q3.19, data = responses.countries)
##             Q3.19
## Q3.25        Most important actor Somewhat important actor Not an important actor Don't know
##   Negative                      2                        0                      0          0
##   Positive                    107                      106                      0          0
##   Mixed                        21                       42                      0          0
##   Don't know                    8                       28                      0          0
# Importance opinions: pull the free-text fields (and Q4.1) for respondents
# who said the US is not an important actor, for qualitative review
importance.opinions <- responses.all %>%
  filter(Q3.19 == "Not an important actor") %>%
  select(survey.id, clean.id, Q3.19,
         contains("TEXT"), Q4.1)

# Counts and proportions of US importance in the combined (org x country) data
responses.all$Q3.19 %>%
  table %>% print %>% prop.table
## .
##     Most important actor Somewhat important actor   Not an important actor               Don't know 
##                      139                      180                       68                      133
## .
##     Most important actor Somewhat important actor   Not an important actor               Don't know 
##                0.2673077                0.3461538                0.1307692                0.2557692
# Opinion of the US (Q3.25) vs. perceived trajectory (Q3.26)
xtabs(~ Q3.25 + Q3.26, data = responses.countries)
##             Q3.26
## Q3.25        Improved Remained constant Slowed down Don't know
##   Negative          2                 0           0          0
##   Positive        149                39          22          3
##   Mixed            32                16          13          2
##   Don't know       19                 8           6          3
# Bar chart of Q1.5, using its Hmisc variable label as the axis title
ggplot(responses.orgs, aes(x = Q1.5.factor)) +
  geom_bar() +
  labs(x = Hmisc::label(responses.orgs$Q1.5))

# Importance of US: recode Q3.8 (US active?) for a handful of respondents
asdf <- responses.all %>% 
  select(clean.id, Q1.2, Q3.8, Q3.6, Q3.7)

# Hand-identified IDs whose Q3.8 answer ("No" / "Don't know") is contradicted
# elsewhere in their response — presumably found in free text; TODO confirm
inconsistent.no <- c(1020, 1152, 1226, 1267, 1323, 1405, 1515)
inconsistent.dont.know <- c(1051, 1512)

# Override the inconsistent answers with "Yes"; everyone else keeps Q3.8
qwer <- asdf %>%
  mutate(us.active = ifelse(
    clean.id %in% c(inconsistent.no, inconsistent.dont.know),
    "Yes",
    as.character(Q3.8)))

# Distribution of US activity after the recode above
qwer$us.active %>% table %>% print %>% prop.table
## .
## Don't know         No        Yes 
##        147         32        352
## .
## Don't know         No        Yes 
## 0.27683616 0.06026365 0.66290019
# Per organization: did ANY of its country rows say the US is not active?
# any() already returns a logical, so the original ifelse(any(...), TRUE,
# FALSE) wrapper was redundant (na.rm=TRUE means the result is never NA)
sdfg <- qwer %>% group_by(clean.id) %>% 
  summarize(said.no = any(us.active == "No", na.rm=TRUE))
sdfg$said.no %>% table %>% print %>% prop.table
## .
## FALSE  TRUE 
##   491    31
## .
##      FALSE       TRUE 
## 0.94061303 0.05938697
# US importance and positivity



# Importance of report 
# Q2.5 (organization level): has the organization heard of the TIP report?
responses.orgs$Q2.5 %>% table %>% print %>% prop.table
## .
##  No Yes 
##  67 452
## .
##        No       Yes 
## 0.1290944 0.8709056
# Q3.23 (country level) — NOTE(review): meaning not visible here; presumably
# whether the report mattered for this country; confirm against questionnaire
responses.countries$Q3.23 %>% table %>% print %>% prop.table
## .
##  No Yes 
## 265 203
## .
##        No       Yes 
## 0.5662393 0.4337607
# Among organizations that have heard of the TIP report (Q2.5 == "Yes"), how
# many gave a substantive answer to Q3.22 rather than "Don't know"?
heard.of.tip <- responses.countries %>% 
  left_join(responses.orgs, by="survey.id") %>%
  filter(Q2.5 == "Yes") %>%
  group_by(survey.id) %>%
  # Direct comparison replaces the redundant ifelse(..., FALSE, TRUE); NA
  # propagates identically in both forms
  mutate(know.score = Q3.22 != "Don't know") %>%
  # select() retains the grouping variable survey.id, so unique() collapses
  # to one row per organization
  select(know.score) %>% unique

heard.of.tip$know.score %>% table %>% print %>% prop.table
## .
## FALSE  TRUE 
##    98   309
## .
##     FALSE      TRUE 
## 0.2407862 0.7592138
# Opinions of report: collect the report-opinion questions and free-text
# fields, one row per country response, for qualitative review
opinions <- responses.all %>% 
  select(clean.id, Q1.2, home.country, work.country,
         Q3.21_1, Q3.21_4_TEXT, Q3.24.Text)

# Organizations that did not answer any part of Q3.21 (uses of the TIP
# report), plus those whose free text explicitly said they have not used it
not.used.tip.ids <- c(1094, 1099, 1106, 1114, 1157, 1221, 1244, 1269, 
                      1314, 1330, 1354, 1357, 1393, 1425)
not.used.tip <- responses.all %>%
  # is.na() and %in% never return NA, so these flags are always TRUE/FALSE;
  # the original ifelse(cond, TRUE, FALSE) wrappers were redundant
  mutate(no.response = is.na(Q3.21_1) & is.na(Q3.21_2) & 
           is.na(Q3.21_3) & is.na(Q3.21_4),
         explicit.no = clean.id %in% not.used.tip.ids) %>%
  select(clean.id, Q1.2, Q3.21_1, Q3.21_2, Q3.21_3, Q3.21_4, no.response, explicit.no) %>%
  group_by(clean.id) %>%
  # any() over NA-free logicals is equivalent to the original sum(...) > 0
  summarize(no.response = any(no.response),
            explicit.no = any(explicit.no))

# How many organizations explicitly said they have not used the TIP report
not.used.tip$explicit.no %>% table
## .
## FALSE  TRUE 
##   508    14
# Does opinion of the US vary by:
# * Tier rating (average) or improvement in Cho score?
# * Whether an NGO has received US funding (or where the COUNTRY has received more TIP grants?)
# * Whether an NGO has interacted with the US
# * Whether a country is rich or poor (or some other quality)
# * Whether an NGO focuses on certain types of work?
# * In which countries does the US seem to have had more collaboration with NGOs?